# Add Libraries 
library(readxl)
library(skimr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(flextable)
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
library(lubridate)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(RColorBrewer)

# Set the custom theme applied to every plot in this analysis
theme_custom <- theme_minimal() +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    strip.text = element_text(size = 10),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    axis.line = element_line(color = "black"),
    # Trailing comma after this argument removed: empty arguments in a
    # call are an error in R ("argument is empty")
    axis.ticks = element_line(color = "black")
  )

# Build a 16-colour discrete palette by concatenating two ColorBrewer sets
palette1 <- brewer.pal(8, "Set2")
palette2 <- brewer.pal(8, "Set3")

merged_palette <- c(palette1, palette2)

1. Read Data and Set Variables Types

# Import the raw sales workbook; coerce character and identifier columns
# to factors and build a combined POSIXct timestamp per transaction
# (the Excel file stores date and time in separate columns).
data_sales <- read_excel("data/raw/Coffee_Shop_Sales.xlsx") %>%
  mutate(
    across(where(is.character), as.factor),
    across(c(transaction_id, store_id, product_id), as.factor),
    transaction_date_and_time = as.POSIXct(
      paste(as.Date(transaction_date), hms::as_hms(transaction_time)),
      format = "%Y-%m-%d %H:%M:%S"
    )
  ) %>%
  relocate(transaction_date_and_time, .after = transaction_time)

# Review basic descriptive statistics of the imported data
skim(data_sales)
Data summary
Name data_sales
Number of rows 149116
Number of columns 12
_______________________
Column type frequency:
factor 7
numeric 2
POSIXct 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
transaction_id 0 1 FALSE 149116 1: 1, 2: 1, 3: 1, 4: 1
store_id 0 1 FALSE 3 8: 50735, 3: 50599, 5: 47782
store_location 0 1 FALSE 3 Hel: 50735, Ast: 50599, Low: 47782
product_id 0 1 FALSE 80 71: 3076, 50: 3053, 59: 3029, 54: 3026
product_category 0 1 FALSE 9 Cof: 58416, Tea: 45449, Bak: 22796, Dri: 11468
product_type 0 1 FALSE 29 Bre: 17183, Gou: 16912, Bar: 16403, Hot: 11468
product_detail 0 1 FALSE 80 Cho: 3076, Ear: 3053, Dar: 3029, Mor: 3026

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
transaction_qty 0 1 1.44 0.54 1.0 1.0 1 2.00 8 ▇▁▁▁▁
unit_price 0 1 3.38 2.66 0.8 2.5 3 3.75 45 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
transaction_date 0 1 2023-01-01 00:00:00 2023-06-30 00:00:00 2023-04-24 00:00:00 181
transaction_time 0 1 1899-12-31 06:00:00 1899-12-31 20:59:32 1899-12-31 11:15:28 25762
transaction_date_and_time 0 1 2023-01-01 06:06:11 2023-06-30 18:57:19 2023-04-24 06:24:32 116129

2. Clean Data and Check for Outliers

# Count missing values in every column
missing_data <- summarise(data_sales, across(everything(), ~ sum(is.na(.x))))

# Keep the rows that are exact duplicates of an earlier row
duplicates <- filter(data_sales, duplicated(data_sales))

# Flag unit_price outliers within a product group via the three-sigma rule.
#
# Args:
#   column: numeric vector (e.g. unit_price values for one product_id).
# Returns:
#   Logical vector the same length as `column`; TRUE where the value lies
#   outside mean +/- 3 SD. NA elements of `column` remain NA.
#   When the SD is undefined (fewer than 2 non-NA values) nothing can be
#   an outlier, so an all-FALSE vector is returned — the previous code
#   returned an all-NA vector in that case, which silently propagated NA
#   into the downstream `filter(any(price_outliers))` step.
detect_outliers_by_product <- function(column) {
  mean_val <- mean(column, na.rm = TRUE)
  sd_val <- sd(column, na.rm = TRUE)

  # Degenerate group: SD undefined -> no value can be flagged
  if (is.na(sd_val)) {
    return(rep(FALSE, length(column)))
  }

  # Define bounds using the three-sigma rule
  lower_bound <- mean_val - 3 * sd_val
  upper_bound <- mean_val + 3 * sd_val

  column < lower_bound | column > upper_bound
}

# Flag per-product price outliers, keeping only the products for which
# at least one outlier was detected
outliers_by_product_price <- data_sales %>%
  group_by(product_id) %>%
  mutate(price_outliers = detect_outliers_by_product(unit_price)) %>%
  filter(any(price_outliers)) %>%
  ungroup()

# Visualize the flagged products: one horizontal box per product_id
ggplot(outliers_by_product_price, aes(x = as.factor(product_id), y = unit_price)) +
  geom_boxplot(width = 0.9) +
  coord_flip() +
  labs(title = "Boxplot of Unit Price by Product ID", x = "Product ID", y = "Unit Price") +
  theme_custom

Upon analysis, it was determined that outlier detection for transaction_qty is unnecessary, as the observed range (1–8) reflects typical and reasonable purchase quantities.

Similarly, no outliers were removed for unit_price, as all values, including price reductions of up to 50% (price_id = 9), are consistent with pricing strategies such as discounts or promotions.

Given the acceptability of the data, no further outlier treatment is required for either transaction_qty or unit_price, ensuring that the dataset remains representative of normal business operations.

3. Descriptive Statistics

3.1. Descriptive Statistics for Numerical Variables

# 95% confidence interval for the mean, formatted as "lower - upper".
# Returns the string "NA" when fewer than 3 non-missing values exist
# (an interval would be meaningless). Uses the normal approximation
# (z = 1.96), reasonable given the large sample sizes in this data.
ci_95 <- function(x) {
  n_obs <- sum(!is.na(x))
  if (n_obs < 3) {
    return("NA")
  }
  center <- mean(x, na.rm = TRUE)
  margin <- 1.96 * sd(x, na.rm = TRUE) / sqrt(n_obs)
  paste0(round(center - margin, 2), " - ", round(center + margin, 2))
}

# List of descriptive statistics; every lambda returns character so that
# numeric results and "NA" placeholders can share one display column.
# NOTE: the leading "_" in each name is deliberate. across() names its
# results "{col}_{fn}", which here produces "<column>__<stat>" — the
# double underscore is exactly the names_sep = "__" used later when
# pivoting the summary into long format.
statistics <- list(
  `_Number of values` = ~as.character(sum(!is.na(.x))),
  `_No data` = ~as.character(sum(is.na(.x))),
  `_Mean` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(mean(.x, na.rm = TRUE) %>% round(2))),
  `_Median` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(median(.x, na.rm = TRUE) %>% round(2))),
  `_SD` = ~ifelse(sum(!is.na(.x)) < 3, "NA", as.character(sd(.x, na.rm = TRUE) %>% round(2))),
  `_Q1 - Q3` = ~ifelse(sum(!is.na(.x)) == 0, "NA", paste0(as.character(quantile(.x, 0.25, na.rm = TRUE) %>% round(2)), " - ", as.character(quantile(.x, 0.75, na.rm = TRUE) %>% round(2)))),
  `_IQR` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(IQR(.x, na.rm = TRUE) %>% round(2))),
  `_95% CI` = ~ci_95(.x),
  `_min` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(min(.x, na.rm = TRUE) %>% round(2))),
  `_max` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(max(.x, na.rm = TRUE) %>% round(2)))
)

# Summarize each numeric variable with the statistics list above and
# render the result as a zebra-striped flextable, long format: one row
# per (variable, statistic) pair. (There is no grouping here — an earlier
# comment mentioning "TenYearCHD" was a copy-paste leftover from a
# different project.)
data_sales %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(), statistics)) %>%
  # Split the "<variable>__<stat>" names produced by across() back
  # into two columns using the deliberate double underscore
  pivot_longer(cols = everything(), names_sep = "__", names_to = c("Variable", "Stat")) %>%
  rename(`Value` = value) %>%
  flextable() %>%
  theme_zebra() %>%
  merge_v("Variable") %>%
  autofit()

Variable

Stat

Value

transaction_qty

Number of values

149116

No data

0

Mean

1.44

Median

1

SD

0.54

Q1 - Q3

1 - 2

IQR

1

95% CI

1.44 - 1.44

min

1

max

8

unit_price

Number of values

149116

No data

0

Mean

3.38

Median

3

SD

2.66

Q1 - Q3

2.5 - 3.75

IQR

1.25

95% CI

3.37 - 3.4

min

0.8

max

45

3.2. Descriptive Statistics for Categorical Variables

# Frequency table for the factor variables, excluding ID-like and
# high-cardinality columns, rendered as a zebra-striped flextable
data_sales %>%
  select(where(is.factor), -transaction_id, -product_id, -product_detail) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
  count(Variable, Value, name = "n") %>%
  group_by(Variable) %>%
  mutate(
    `No data` = sum(is.na(Value)),
    `% by group` = paste0(round((n / sum(n)) * 100, 2), " %")
  ) %>%
  ungroup() %>%
  select(Variable, Value, n, `% by group`, `No data`) %>%
  arrange(Variable, Value) %>%
  flextable() %>%
  theme_zebra() %>%
  merge_v("Variable") %>%
  autofit()

Variable

Value

n

% by group

No data

product_category

Bakery

22,796

15.29 %

0

Branded

747

0.5 %

0

Coffee

58,416

39.17 %

0

Coffee beans

1,753

1.18 %

0

Drinking Chocolate

11,468

7.69 %

0

Flavours

6,790

4.55 %

0

Loose Tea

1,210

0.81 %

0

Packaged Chocolate

487

0.33 %

0

Tea

45,449

30.48 %

0

product_type

Drinking Chocolate

266

0.18 %

0

Barista Espresso

16,403

11 %

0

Biscotti

5,711

3.83 %

0

Black tea

303

0.2 %

0

Brewed Black tea

11,350

7.61 %

0

Brewed Chai tea

17,183

11.52 %

0

Brewed Green tea

5,671

3.8 %

0

Brewed herbal tea

11,245

7.54 %

0

Chai tea

443

0.3 %

0

Clothing

221

0.15 %

0

Drip coffee

8,477

5.68 %

0

Espresso Beans

319

0.21 %

0

Gourmet Beans

366

0.25 %

0

Gourmet brewed coffee

16,912

11.34 %

0

Green beans

134

0.09 %

0

Green tea

159

0.11 %

0

Herbal tea

305

0.2 %

0

Hot chocolate

11,468

7.69 %

0

House blend Beans

183

0.12 %

0

Housewares

526

0.35 %

0

Organic Beans

415

0.28 %

0

Organic brewed coffee

8,489

5.69 %

0

Organic Chocolate

221

0.15 %

0

Pastry

6,912

4.64 %

0

Premium Beans

336

0.23 %

0

Premium brewed coffee

8,135

5.46 %

0

Regular syrup

4,979

3.34 %

0

Scone

10,173

6.82 %

0

Sugar free syrup

1,811

1.21 %

0

store_id

3

50,599

33.93 %

0

5

47,782

32.04 %

0

8

50,735

34.02 %

0

store_location

Astoria

50,599

33.93 %

0

Hell's Kitchen

50,735

34.02 %

0

Lower Manhattan

47,782

32.04 %

0

4. Data Analysis

4.1. Sales by Product

4.1.1. Sales by Product Category

# Total quantity and revenue per product category, ordered by revenue
sales_by_category <- data_sales %>%
  group_by(product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE)
  ) %>%
  arrange(desc(total_sales_value))

# Bar plot of one sales metric by product category.
#
# Args:
#   data:         data frame with a product_category column.
#   y_var:        string name of the numeric column to plot.
#   title:        plot title.
#   y_label:      y-axis label.
#   y_breaks_by:  spacing between y-axis breaks.
#   label_prefix: text prepended to each bar label (e.g. "$", "n = ").
plot_sales_categories <- function(data, y_var, title, y_label, y_breaks_by, label_prefix = "") {
  # .data[[y_var]] is the supported tidy-eval way to reference a column
  # whose name is held in a string; get() relies on ambient scoping
  ggplot(data, aes(x = reorder(product_category, -.data[[y_var]]), y = .data[[y_var]], fill = product_category)) +
    geom_col() +
    geom_label(aes(label = paste0(label_prefix, round(.data[[y_var]]))),
               fill = "white",
               size = 3,
               fontface = "bold",
               label.size = 0.3,
               vjust = -0.2,
               position = position_dodge(width = 0.9)) +
    # Headroom of 20% so the labels above the bars are not clipped
    scale_y_continuous(labels = comma,
                       breaks = seq(0, max(data[[y_var]]) * 1.2, by = y_breaks_by),
                       limits = c(0, max(data[[y_var]]) * 1.2)) +
    scale_fill_manual(values = merged_palette) +
    labs(title = title,
         x = "Product Category",
         y = y_label) +
    theme_custom +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}

# Plot total units sold per category
plot_sales_categories(
  sales_by_category,
  y_var = "total_sales_qty",
  title = "Total Sales Quantity by Product Category",
  y_label = "Total Sales Quantity",
  y_breaks_by = 10000,
  label_prefix = "n = "
)

# Plot total revenue per category
plot_sales_categories(
  sales_by_category,
  y_var = "total_sales_value",
  title = "Total Sales Income by Product Category",
  y_label = "Total Sales Income",
  y_breaks_by = 25000,
  label_prefix = "$"
)

# Average revenue per transaction for each product category
avg_sales_per_transaction_category <- data_sales %>%
  group_by(product_category) %>%
  summarise(avg_sales_per_transaction = sum(transaction_qty * unit_price, na.rm = TRUE) / n()) %>%
  arrange(desc(avg_sales_per_transaction))

# Horizontal bar chart of the per-transaction averages
ggplot(avg_sales_per_transaction_category,
       aes(x = reorder(product_category, avg_sales_per_transaction),
           y = avg_sales_per_transaction,
           fill = product_category)) +
  geom_bar(stat = "identity") +
  geom_label(aes(label = paste0("$", comma(round(avg_sales_per_transaction, 2)))),
             position = position_stack(vjust = 0.5),
             size = 3, fill = "white") +
  coord_flip() +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Average Sales Per Transaction by Product Category",
       x = "Product Category",
       y = "Average Sales Per Transaction ($)") +
  theme_custom

Coffee beans and Branded products may be higher-priced items, leading to higher per-transaction sales even though they have fewer units sold. Coffee and Tea are likely lower-priced items but are sold in much larger quantities, contributing more significantly to the overall revenue, even though their average sales per transaction are lower.

Key Observations:

  1. Coffee is the top-performing product category in both quantity and value. It has the highest number of units sold and also generates the most revenue.

  2. Tea is the second most popular category, both in terms of the number of units sold and total revenue, though its sales figures are significantly lower than Coffee.

  3. Bakery and Drinking Chocolate show moderate sales in both quantity and value, ranking third and fourth in both metrics.

There’s a clear concentration of sales in a few major categories (Coffee, Tea, and Bakery), with the other categories contributing much smaller amounts.

4.1.2. Sales by Product Type

There are 29 distinct product types in the dataset. To streamline the analysis, I chose to focus on the product types that belong to the top-selling product categories. These top categories were identified as those contributing at least 10% of the total revenue. This threshold is flexible and can be adjusted as needed.

# Calculate total sales income across all categories
total_sales_income <- sum(sales_by_category$total_sales_value, na.rm = TRUE)

# Keep only the categories contributing at least 10% of total income
significant_categories <- sales_by_category %>%
  filter(total_sales_value >= 0.1 * total_sales_income) %>%
  pull(product_category)

# Quantity, revenue and transaction count per product type within the
# significant categories. `.groups = "drop"` makes the result explicitly
# ungrouped and silences the summarise() grouping message.
sales_by_type_significant <- data_sales %>%
  filter(product_category %in% significant_categories) %>%
  group_by(product_category, product_type) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    count_transactions = n(),  # transactions per product type
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales_value))
# Dodged bar plot of a sales metric for product types within categories.
# NOTE: this redefines plot_sales_categories from section 4.1.1 with a
# richer signature (x/fill columns passed as strings).
#
# Args:
#   data:         summarised data frame.
#   x_var:        string name of the category column (x axis).
#   y_var:        string name of the numeric metric column.
#   fill_var:     string name of the column mapped to fill.
#   title, y_label, y_breaks_by, label_prefix: as in the category plot.
plot_sales_categories <- function(data, x_var, y_var, fill_var, title, y_label, y_breaks_by, label_prefix = "") {
  # .data[[...]] resolves string column names safely (preferred over get())
  ggplot(data, aes(x = reorder(.data[[x_var]], -.data[[y_var]]), y = .data[[y_var]], fill = .data[[fill_var]])) +
    geom_col(position = position_dodge2(width = 0.9)) +
    geom_text(aes(label = paste0(label_prefix, round(.data[[y_var]])),
                  y = .data[[y_var]] / 2),  # centre the label inside each bar
              size = 3,
              fontface = "bold",
              colour = "white",
              position = position_dodge2(width = 0.9),
              vjust = 0.5) +
    scale_y_continuous(labels = comma,
                       breaks = seq(0, max(data[[y_var]]) * 1.2, by = y_breaks_by),
                       limits = c(0, max(data[[y_var]]) * 1.2)) +
    scale_fill_manual(values = merged_palette) +
    labs(title = title,
         x = "Product Category",
         y = y_label,
         fill = "Product Type") +
    coord_flip() +
    theme_custom +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "right")
}

# Quantity plot: product types within the significant categories
plot_sales_categories(
  sales_by_type_significant,
  x_var = "product_category",
  y_var = "total_sales_qty",
  fill_var = "product_type",
  title = "Total Sales Quantity by Product Category and Product Type",
  y_label = "Total Sales Quantity",
  y_breaks_by = 2500,
  label_prefix = "n = "
)

# Revenue plot: product types within the significant categories
plot_sales_categories(
  sales_by_type_significant,
  x_var = "product_category",
  y_var = "total_sales_value",
  fill_var = "product_type",
  title = "Total Sales Value by Product Category and Product Type",
  y_label = "Total Sales Value",
  y_breaks_by = 10000,
  label_prefix = "$"
)

The analysis of sales by product type of the best selling categories shows:

The top three best-selling product types were Brewed Chai Tea (sold 26,250 times), Gourmet Brewed Coffee (sold 25,973 times), and Barista Espresso (sold 24,943 times). The top three product types generating the highest profit were Barista Espresso ($91,406), Brewed Chai Tea ($77,082), and Hot Chocolate ($72,416 from 17,457 sales).

It is important to note that the best-selling product type did not generate the highest profit, but it is still among the top three product types by profit. Let's verify that there is a correlation between the number of products sold and total sales value.

# Scatter of units sold vs revenue per product type, with a linear trend line
ggplot(sales_by_type_significant, aes(x = total_sales_qty, y = total_sales_value)) +
  geom_point(aes(color = product_type), size = 4, alpha = 0.6) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines
  # (deprecated in ggplot2 3.4.0 — this removes the runtime warning)
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 0.5) +
  scale_color_manual(values = merged_palette) +
  labs(title = "Correlation Between Units Sold and Total Profit by Product Type",
       x = "Units Sold",
       y = "Total Sales Value ($)",
       color = "Product Type") +
  theme_custom +
  theme(
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6),
    legend.key.size = unit(0.1, "cm")
  )
## `geom_smooth()` using formula = 'y ~ x'

The dark red line represents the trend line (fitted by linear regression), showing the general positive relationship between units sold and total sales value. This means that, as the number of units sold increases, total sales value generally increases as well. The correlation between sales and profit is positive, but the relationship isn’t perfect—some products deviate from the trend. No product types were discovered that had low sales but high profit, indicating that products that generate significant revenue generally also have high sales volumes.

Ensuring that all stores are regularly stocked with these high-performing products is crucial, as they play a significant role in both sales volume and revenue.

4.1.3. Sales By Product ID

There are 80 unique product IDs in the dataset, but for this analysis, I concentrated on the Top 10 items that contribute the most significant portion of the total profit or were best-sellers.

# Total quantity and revenue for each product (product_id / detail pair),
# sorted by revenue
sales_profit <- data_sales %>%
  group_by(product_id, product_detail) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales_value))

# BUG FIX: the original code assigned the "top 10 by value" selection and
# immediately overwrote it with the "top 10 by quantity" one, so the
# "Top 10 Products by Total Profit" plot actually showed the best-sellers.
# Keep the two selections in separate objects and plot each from its own.
top_10_products_by_value <- sales_profit %>%
  slice_max(order_by = total_sales_value, n = 10)

top_10_products_by_qty <- sales_profit %>%
  arrange(desc(total_sales_qty)) %>%
  slice(1:10)

# Top 10 products by revenue
ggplot(top_10_products_by_value, aes(x = reorder(product_id, total_sales_value), y = total_sales_value, fill = product_detail)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = paste("$", round(total_sales_value))),
            size = 3,
            fontface = "bold",
            colour = "white",
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Top 10 Products by Total Profit",
       x = "Product ID",
       y = "Total Sales Value ($)",
       fill = "Product Detail") +
  theme_custom +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 8),
    legend.key.size = unit(0.3, "cm")
  )

# Top 10 products by quantity sold
ggplot(top_10_products_by_qty, aes(x = reorder(product_id, total_sales_qty), y = total_sales_qty, fill = product_detail)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = paste("n = ", round(total_sales_qty))),
            size = 3,
            fontface = "bold",
            colour = "white",
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Top 10 Products by Sales Quantity",
       x = "Product ID",
       y = "Total Quantity",
       fill = "Product Detail") +
  theme_custom +
  theme(
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 8),
    legend.key.size = unit(0.3, "cm")
  )

Since the previous analysis didn’t provide sufficient insights, I decided to focus on identifying which drink sizes were the best sellers within the coffee, tea, and drinking chocolate categories.

# Extract the drink size encoded in product_detail (Sm/Rg/Lg) and
# aggregate sales per category and size. `.groups = "drop"` returns an
# ungrouped result and silences the summarise() grouping message.
sales_by_size <- data_sales %>%
  filter(product_category %in% c("Coffee", "Tea", "Drinking Chocolate")) %>%
  mutate(
    drink_size = case_when(
      grepl("Sm", product_detail) ~ "Small",
      grepl("Rg", product_detail) ~ "Regular",
      grepl("Lg", product_detail) ~ "Large",
      TRUE ~ "Unknown"
    )
  ) %>%
  group_by(product_category, drink_size) %>%
  summarise(total_sales_qty = sum(transaction_qty, na.rm = TRUE),
            total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(total_sales_qty))
# Units sold by drink size within each category (dodged bars)
ggplot(sales_by_size, aes(x = product_category, y = total_sales_qty, fill = drink_size)) +
  geom_col(position = position_dodge2(width = 0.9)) +
  geom_label(aes(label = paste0("n=", round(total_sales_qty)), group = drink_size),
             fill = "white",
             size = 2,
             fontface = "bold",
             label.size = 0.2,
             vjust = -0.5,
             position = position_dodge2(width = 0.9)) +
  scale_fill_manual(values = palette2) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(sales_by_size$total_sales_qty) * 1.2, by = 5000),
                     limits = c(0, max(sales_by_size$total_sales_qty) * 1.2)) +
  labs(title = "Best-Selling Drink Sizes by Category",
       x = "Product Category",
       y = "Total Sales Quantity",
       fill = "Drink Size") +
  theme_custom

# Revenue by drink size within each category (dodged bars)
ggplot(sales_by_size, aes(x = product_category, y = total_sales_value, fill = drink_size)) +
  geom_col(position = position_dodge2(width = 0.9)) +
  geom_label(aes(label = paste0("$", round(total_sales_value)), group = drink_size),
             fill = "white",
             size = 2,
             fontface = "bold",
             label.size = 0.2,
             vjust = -0.5,
             position = position_dodge2(width = 0.9)) +
  scale_fill_manual(values = palette2) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(sales_by_size$total_sales_value) * 1.2, by = 25000),
                     limits = c(0, max(sales_by_size$total_sales_value) * 1.2)) +
  labs(title = "Total Sales Value by Drink Size and Category",
       x = "Product Category",
       y = "Total Sales Value ($)",
       fill = "Drink Size") +
  theme_custom

This visualization helps highlight which drink sizes are most popular in each product category, with Large and Regular sizes standing out across all categories.

These insights suggest that stocking and promoting Large and Regular drinks should be a priority for maximizing both sales volume and profit in these categories. Efforts should be focused on ensuring these sizes are always available, especially in the Tea and Drinking Chocolate categories.

4.2. Sales by Store ID and Location

4.2.1. Compare Different Stores by Sales Value and Quantity Sold

# Summarize sales per store, then stack quantity/value into one long
# metric column for faceting. `.groups = "drop"` makes the result
# explicitly ungrouped and silences the summarise() message.
sales_by_store <- data_sales %>%
  group_by(store_location, store_id) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales_value)) %>%
  pivot_longer(cols = c(total_sales_qty, total_sales_value),
               names_to = "sales_metric",
               values_to = "sales_value")
# Faceted bar chart: one panel for quantity, one for dollar value
ggplot(sales_by_store,
       aes(x = reorder(store_location, sales_value), y = sales_value, fill = store_location)) +
  geom_col(position = position_dodge2(width = 0.9)) +
  geom_label(aes(label = comma(round(sales_value))),
             fill = "white",
             position = position_dodge2(width = 0.9),
             vjust = -0.3,
             size = 3,
             fontface = "bold") +
  scale_fill_manual(values = palette2) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(sales_by_store$sales_value) * 1.2, by = 50000),
                     limits = c(0, max(sales_by_store$sales_value) * 1.2)) +
  facet_wrap(~ sales_metric, scales = "free") +
  labs(title = "Total Sales Quantity and Value by Store Location",
       x = "Store Location",
       y = "Sales Value or Quantity",
       fill = "Store Location") +
  theme_custom

4.2.2. Sales by Product Category for Each Store Location

# Quantity and revenue per product category within each store location.
# `.groups = "drop"` returns an ungrouped result and silences the
# summarise() grouping message.
sales_by_category_stores <- data_sales %>%
  group_by(store_location, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(store_location, desc(total_sales_value))
# Revenue by product category, faceted by store location
ggplot(sales_by_category_stores,
       aes(x = reorder(product_category, total_sales_value),
           y = total_sales_value,
           fill = product_category)) +
  geom_col(position = "dodge2") +
  facet_wrap(~ store_location) +
  scale_fill_manual(values = merged_palette) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(sales_by_category_stores$total_sales_value) * 1.2, by = 10000)) +
  labs(title = "Sales by Product Category and Store Location (Profit)",
       x = "Product Category",
       y = "Total Sales Value ($)",
       fill = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Units sold by product category, faceted by store location
ggplot(sales_by_category_stores, aes(x = reorder(product_category, total_sales_qty), y = total_sales_qty, fill = product_category)) +
  geom_bar(stat = "identity", position = "dodge2") +
  facet_wrap(~ store_location) +
  scale_fill_manual(values = merged_palette) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(sales_by_category_stores$total_sales_qty) * 1.2, by = 5000)) +
  # BUG FIX: this panel plots quantities, not dollar value — the y-axis
  # label previously said "Total Sales Value ($)"
  labs(title = "Sales by Product Category and Store Location (Sold Items)",
       x = "Product Category",
       y = "Total Sales Quantity",
       fill = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

4.2.3. Average Sales Per Transaction by Store

# Average revenue per transaction for each store location
avg_sales_per_transaction_store <- data_sales %>%
  group_by(store_location) %>%
  summarise(avg_sales_per_transaction = sum(transaction_qty * unit_price, na.rm = TRUE) / n()) %>%
  arrange(desc(avg_sales_per_transaction))

# Lollipop chart: a grey stem per store, topped by a translucent halo and
# a solid point, labelled with the average value
ggplot(avg_sales_per_transaction_store,
       aes(x = reorder(store_location, avg_sales_per_transaction),
           y = avg_sales_per_transaction)) +
  geom_segment(aes(xend = reorder(store_location, avg_sales_per_transaction),
                   y = 0,
                   yend = avg_sales_per_transaction),
               color = "grey30") +
  geom_point(aes(color = store_location), size = 10, alpha = 0.2) +
  geom_point(aes(color = store_location), size = 6) +
  geom_label(aes(label = comma(round(avg_sales_per_transaction, 2))),
             vjust = -0.5,
             size = 4, fill = "white") +
  scale_color_manual(values = palette1) +
  scale_y_continuous(labels = comma,
                     breaks = seq(0, max(avg_sales_per_transaction_store$avg_sales_per_transaction) * 1.2, by = 0.5),
                     limits = c(0, max(avg_sales_per_transaction_store$avg_sales_per_transaction) * 1.2)) +
  labs(title = "Average Sales per Transaction by Store Location",
       x = "Store Location",
       y = "Average Sales per Transaction ($)",
       color = "Store Location") +
  theme_custom

There is little variation in the quantity of products sold, total profit, and sales across different product categories among the three stores. Similarly, the average sales per transaction are quite consistent across the stores, indicating that customer spending patterns are generally similar.

4.5. Sales by Timeline

4.5.1. Sales Trend Over Time (Monthly, Weekly, Daily)

# Monthly revenue per product category. `.groups = "drop"` replaces the
# trailing ungroup() and silences the summarise() grouping message.
sales_by_month_category <- data_sales %>%
  mutate(
    month = floor_date(transaction_date_and_time, "month"),
    month_year = format(month, "%b-%Y")  # display label, e.g. "Jan-2023"
  ) %>%
  group_by(month, month_year, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .groups = "drop"
  )

# Plot the sales trends by Month-Year (reorder keeps labels chronological)
ggplot(sales_by_month_category, aes(x = reorder(month_year, month), y = total_sales_value, color = product_category, group = product_category)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4.0)
  geom_line(linewidth = 0.5) +
  geom_point(size = 1) +
  scale_color_manual(values = merged_palette) +
  labs(title = "Sales Trend Over Time by Product Category",
       x = "Month-Year",
       y = "Total Sales Value ($)",
       color = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Weekly revenue per product category. `.groups = "drop"` replaces the
# trailing ungroup() and silences the summarise() grouping message.
sales_by_week_category <- data_sales %>%
  mutate(
    week = floor_date(transaction_date_and_time, "week"),
    week_year = format(week, "%U-%Y")  # label: week-of-year + year
  ) %>%
  group_by(week, week_year, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .groups = "drop"
  )

# Plot the sales trends by Week-Year
ggplot(sales_by_week_category, aes(x = week_year, y = total_sales_value, color = product_category, group = product_category)) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4.0)
  geom_line(linewidth = 0.5) +
  geom_point(size = 1) +
  scale_color_manual(values = merged_palette) +
  labs(title = "Sales Trend Over Time by Product Category (Weekly)",
       x = "Week-Year",
       y = "Total Sales Value ($)",
       color = "Product Category") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The graph shows that sales of Coffee and Tea products increase significantly from February to June. Other product categories, such as Bakery, Drinking Chocolate, and Coffee Beans, also grow over this period, though more gradually and less sharply than Coffee and Tea.

The demand for Coffee and Tea remained steady in January, then started to rise from February through June. This pattern suggests that colder weather might have sustained demand early in the year, while warmer spring months encouraged even more consumption. Meanwhile, categories like Flavours and Packaged Chocolate maintained steady, year-round demand with minimal seasonal impact.

To confirm these patterns, analyzing data over multiple years or incorporating weather data could help determine if these trends repeat each year.

4.5.2. Sales by Day of the Week

# Aggregate sales by day of the week.
# NOTE(review): `transaction_count` is the total units sold (sum of
# transaction_qty), not a count of transaction rows — kept as-is because
# the plot below references this column name.
sales_by_day_of_week <- data_sales %>%
  mutate(
    day_of_week = wday(transaction_date_and_time, label = TRUE, abbr = TRUE),
    line_total = transaction_qty * unit_price  # revenue of each line item
  ) %>%
  group_by(day_of_week) %>%
  summarise(
    transaction_count = sum(transaction_qty, na.rm = TRUE),
    total_sales_value = sum(line_total, na.rm = TRUE)
  )

# Scaling factor that maps sales dollars onto the quantity axis so both
# series can share one panel (undone on the secondary axis).
scaling_factor <-
  max(sales_by_day_of_week$total_sales_value) /
  max(sales_by_day_of_week$transaction_count)

# Dual-axis chart: bars = units sold per weekday; dashed line = revenue
# rescaled onto the bar axis (true dollars restored on the secondary axis).
ggplot(sales_by_day_of_week, aes(x = day_of_week)) +
  geom_col(aes(y = transaction_count, fill = day_of_week), alpha = 0.7) +
  geom_line(aes(y = total_sales_value / scaling_factor, group = 1),
            color = "darkblue", linewidth = 1, linetype = "dashed") +
  geom_point(aes(y = total_sales_value / scaling_factor),
             color = "darkblue", size = 3) +
  geom_text(aes(y = total_sales_value / scaling_factor,
                label = paste0("$", comma(round(total_sales_value, 2)))),
            vjust = -1, hjust = 0.5, color = "darkblue", size = 3) +
  scale_y_continuous(
    name = "Transaction Count",
    sec.axis = sec_axis(~ . * scaling_factor, name = "Total Sales Value ($)"),
    limits = c(0, max(sales_by_day_of_week$transaction_count) * 1.2)
  ) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Frequency of Transactions and Total Sales Income by Day of the Week",
       x = "Day of the Week") +
  theme_custom +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

Product sales and profit stayed steady across the week, with a small drop on weekends.

4.5.3. Sales by Time of Day

# Bucket each transaction into a coarse time-of-day period.
# hour() yields an integer 0-23, so the inclusive between() ranges are
# equivalent to the original >=/< comparisons. Hours outside 06:00-20:59
# (if any exist in the data) fall through to NA, as before.
data_sales <- data_sales %>%
  mutate(
    time_of_day = case_when(
      between(hour(transaction_date_and_time), 6L, 11L)  ~ "Morning",
      between(hour(transaction_date_and_time), 12L, 16L) ~ "Afternoon",
      between(hour(transaction_date_and_time), 17L, 20L) ~ "Evening"
    ),
    # Fix the factor order so plots show Morning -> Afternoon -> Evening.
    time_of_day = factor(time_of_day, levels = c("Morning", "Afternoon", "Evening"))
  )

# Summarize total sales quantity by time of day and product category.
# `.groups = "drop"` returns an ungrouped frame directly, silencing the
# "grouped output" message and making the trailing ungroup() unnecessary.
sales_by_time_of_day <- data_sales %>%
  group_by(time_of_day, product_category) %>%
  summarise(
    total_sales_qty = sum(transaction_qty, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'time_of_day'. You can override using the
## `.groups` argument.
# Total revenue per time-of-day period, summed across all product categories.
# With a single grouping variable, summarise() already returns an ungrouped
# frame, so no explicit ungroup() is needed.
total_sales_value_by_time <- data_sales %>%
  group_by(time_of_day) %>%
  summarise(
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE)
  )

# Scaling factor that maps sales dollars onto the quantity axis so both
# series can share one panel (undone on the secondary axis).
scaling_factor <-
  max(total_sales_value_by_time$total_sales_value) /
  max(sales_by_time_of_day$total_sales_qty)

# Plot the sales by time of day with a single line for total sales value.
# Stacked bars show units sold per category; the dashed line shows total
# revenue rescaled onto the quantity axis (actual dollars on the secondary axis).
ggplot() +
  geom_col(data = sales_by_time_of_day,
           aes(x = time_of_day, y = total_sales_qty, fill = product_category),
           position = "stack", alpha = 0.7) +
  geom_line(data = total_sales_value_by_time,
            aes(x = time_of_day, y = total_sales_value / scaling_factor, group = 1),
            color = "darkblue", linewidth = 0.5, linetype = "dashed") +
  geom_point(data = total_sales_value_by_time,
             aes(x = time_of_day, y = total_sales_value / scaling_factor),
             color = "darkblue",
             size = 2) +
  geom_text(data = total_sales_value_by_time,
            aes(x = time_of_day, y = total_sales_value / scaling_factor,
                label = paste0("$", comma(round(total_sales_value, 2)))),
            vjust = -1.2,
            color = "darkblue",
            size = 3) +
  scale_y_continuous(name = "Total Products Sold",
                     sec.axis = sec_axis(~ . * scaling_factor, name = "Total Sales Value ($)")) +
  scale_fill_manual(values = merged_palette) +
  labs(title = "Sales Comparison by Time of Day and Product Category",
       x = "Time of Day",
       y = "Total Products Sold",
       fill = "Product Category") +
  theme_custom +
  theme(legend.position = "right")

The analysis of sales by time of day reveals that Morning hours drive the highest product sales and revenue, reaching $338,289. Afternoon sales contribute a moderate $204,721, while Evening hours show the lowest revenue at $105,803.

This trend suggests that Morning hours are peak shopping times, making it essential to ensure optimal product availability and staffing during these hours to maximize sales potential.

4.5.4. Sales Trend

# Monthly Sales Growth: revenue per month plus month-over-month % change.
# `.by` gives a per-call grouping (no persistent groups to drop); arrange()
# guarantees chronological order before lag() computes the previous month.
monthly_sales <- data_sales %>%
  mutate(month = floor_date(transaction_date_and_time, "month")) %>%
  summarise(
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .by = month
  ) %>%
  arrange(month) %>%
  mutate(monthly_growth = (total_sales_value - lag(total_sales_value)) / lag(total_sales_value) * 100) # actual minus previous

# Scaling factor mapping growth % onto the revenue axis for the dual-axis plot.
monthly_growth_scale <-
  max(monthly_sales$total_sales_value, na.rm = TRUE) /
  max(monthly_sales$monthly_growth, na.rm = TRUE)

# Plot Monthly Sales and Growth Rate.
# Bars: revenue (primary axis). Dashed line: month-over-month growth %
# rescaled onto the primary axis; the secondary axis undoes the scaling.
ggplot(monthly_sales, aes(x = month)) +
  geom_col(aes(y = total_sales_value), fill = "skyblue", alpha = 0.8) +
  geom_line(aes(y = monthly_growth * monthly_growth_scale),
            color = "darkred", linetype = "dashed", linewidth = 1) +
  geom_point(aes(y = monthly_growth * monthly_growth_scale),
             color = "darkred", size = 2) +
  scale_y_continuous(
    name = "Total Sales Value ($)",
    sec.axis = sec_axis(~ . / monthly_growth_scale, name = "Monthly Growth Rate (%)"),
    # The first month's growth is NA, so min() needs na.rm = TRUE (otherwise the
    # limit itself becomes NA). The lower bound must be in primary-axis units,
    # i.e. the SCALED growth, so a negative-growth month is not clipped.
    limits = c(min(c(0, monthly_sales$monthly_growth * monthly_growth_scale), na.rm = TRUE),
               max(monthly_sales$total_sales_value) * 1.2)
  ) +
  geom_label(aes(
    y = monthly_growth * monthly_growth_scale,
    label = paste0(round(monthly_growth, 2), "%")
  ),
  vjust = -0.5,
  size = 3) +
  labs(title = "Monthly Sales Growth or Decline",
       x = "Month",
       y = "Total Sales Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_label()`).

# Weekly Sales Growth: revenue per week plus week-over-week % change.
# `.by` gives a per-call grouping (no persistent groups to drop); arrange()
# guarantees chronological order before lag() computes the previous week.
weekly_sales <- data_sales %>%
  mutate(week = floor_date(transaction_date_and_time, "week")) %>%
  summarise(
    total_sales_value = sum(transaction_qty * unit_price, na.rm = TRUE),
    .by = week
  ) %>%
  arrange(week) %>%
  mutate(weekly_growth = (total_sales_value - lag(total_sales_value)) / lag(total_sales_value) * 100)

# Scaling factor mapping growth % onto the revenue axis for the dual-axis plot.
weekly_growth_scale <-
  max(weekly_sales$total_sales_value, na.rm = TRUE) /
  max(weekly_sales$weekly_growth, na.rm = TRUE)

# Plot Weekly Sales and Growth Rate.
# Bars: revenue (primary axis). Dashed line: week-over-week growth %
# rescaled onto the primary axis; the secondary axis undoes the scaling.
ggplot(weekly_sales, aes(x = week)) +
  geom_col(aes(y = total_sales_value), fill = "pink", alpha = 0.8) +
  geom_line(aes(y = weekly_growth * weekly_growth_scale),
            color = "darkblue", linetype = "dashed", linewidth = 1) +
  geom_point(aes(y = weekly_growth * weekly_growth_scale),
             color = "darkblue", size = 2) +
  scale_y_continuous(
    name = "Total Sales Value ($)",
    sec.axis = sec_axis(~ . / weekly_growth_scale, name = "Weekly Growth Rate (%)"),
    # The first week's growth is NA, so min() needs na.rm = TRUE (otherwise the
    # limit itself becomes NA). The lower bound must be in primary-axis units,
    # i.e. the SCALED growth, so negative-growth weeks are not clipped.
    limits = c(min(c(0, weekly_sales$weekly_growth * weekly_growth_scale), na.rm = TRUE),
               max(weekly_sales$total_sales_value) * 1.2)
  ) +
  geom_label(aes(
    y = weekly_growth * weekly_growth_scale,
    label = paste0(round(weekly_growth, 2), "%")
  ),
  vjust = -0.2,
  size = 3) +
  labs(title = "Weekly Sales Growth or Decline",
       x = "Week",
       y = "Total Sales Value") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_label()`).

The strongest sales growth occurs from March to May, signaling a potentially seasonal or trend-based increase in demand during this period. Although June maintains high sales, the growth rate begins to level off, suggesting a plateau.

The weekly growth rate shows high variability, with some weeks experiencing sharp growth while others show significant declines. This may indicate a pattern of inconsistent demand or varying promotional strategies.